#######################################
### OPTIMAL MATCHING on TRANSITIONS ### 
###       DISSIMILARITY MATRIX      ###
#######################################
# Labussiere, Levels and Vink 2021

# This script recodes sequences of states into sequences of transitions and
# calculates the optimal matching distance on the transition elements (OMTR)
# Input: dataset of individual sequences of states in wide format (not shareable)
#        matrix coding transitions for each sequence of states (see csv)
#        matrix of substitution costs (see csv)
#        vector of indel costs (see below)
# Output: the matrix with the individual sequences of transitions
#         the dissimilarity matrix resulting from OMTR

rm(list = ls())

# Libraries
library(TraMineR)
library(foreign)
library(RColorBrewer)


# INPUT DATASET
# Individuals in rows and states in columns
# 12,505 students in rows x 10 years (including start) in columns
mat_states <- read.dta(individual_states_wide)

nb_ind = nrow(mat_states) # number of individuals
len_obs = 10 # length of observation


## Build the transitions matrix
###############################################

# Input: the states-to-transitions matrix
# See also Table S4 in the Supplementary Materials 
path = "states_to_transitions_matrix.csv"
mat_states_to_transitions = read.csv(path, header = T, sep = ';',stringsAsFactors = FALSE)


# Initialisation: matrix of observed transitions
mat_transitions = matrix(0,nb_ind,len_obs-1)

# Loop to recode state elements into transition elements
row <- 1
while (row<(nb_ind + 1)) {
  column <- 1
  while (column<len_obs) {
    mat_transitions[row,column] = mat_states_to_transitions[mat_states[row,column],mat_states[row,column+1]]
    column <- column + 1
  }
  row <- row + 1
}

# OUTPUT: a matrix with the individual sequences of transitions
# Used as an input in clustering.do
write.csv(mat_transitions, "individual_transitions.csv")


## Build the transition costs matrix
###############################################

nb_transitions = max(mat_transitions)  # Number of transition categories 

# Input: the substitution cost matrix
# See also Table S5 in the Supplementary Materials 
# load and convert to numeric values
path = "substitution_costs_matrix.csv"
subs_costs = read.csv(path, header = F, skip=1, sep = ';',stringsAsFactors = FALSE)
mat_subs_costs = as.matrix(sapply(subs_costs, as.numeric), nb_transitions, nb_transitions)


## Indel costs vector
###############################################

# Indel costs vector
# See also Table S6 in the Supplementary Materials 
indel_costs <- c(0.1,2,3,1.5,2.5,1000,1000,1000)
# NB: excessive costs of 1000 are given for the start transition categories ("Start VMBO", "Start Bridge, "Start HAVO/VWO") 



## Run Optimal matching on transition elements
###############################################

# Free memory
mat_states <- NULL
mat_states_to_transitions <- NULL
gc()

OM_transitions <- seqdist(seqdef(mat_transitions), method="OM", norm=FALSE, indel=indel_costs, sm=mat_subs_costs, with.miss=FALSE, full.matrix=TRUE)	

# OUTPUT: dissimilarity matrix
# Used as an input in clustering.do
write.csv(OM_transitions, "dissimilarity_matrix_OMTR.csv")


